-Manipulating and cleaning the data.
-Examining each app category's share of the platform based on the number of installs.
-Making heat maps to see the relationship between the Rating and the size and price of the app.
-Looking at the app price distribution before and after removing outliers.
import pandas as pd
import numpy as np
import plotly.express as px
# Load the Google Play Store app listing.
df = pd.read_csv("googleplaystore.csv")
df.head()

# Count the rows that are exact copies of an earlier row, report the number,
# then keep only the first occurrence of each row.
duplicate_count = int(df.duplicated().sum())
print("The number of duplicate we have is:", duplicate_count)
df = df.drop_duplicates()

# Column dtypes and non-null counts, followed by the per-column null totals.
df.info()
print('The columns with null values are: Rating - Type - Content Rating - Current Ver - Android Ver.')
print("We can see alot of null values in the Rating column:")
df.isna().sum()
========================================================================================================================
# Clean the "Price" and "Installs" text columns and convert them to numbers.
df[["Price", "Installs"]].head()
df[df["Price"] != "0"][["Price", "Installs"]].head()

# Strip the currency sign, the thousands separators and the trailing plus
# sign. regex=False is explicit because "$" and "+" are regex metacharacters
# and the default of `regex` differs between pandas versions.
for column in ["Price", "Installs"]:
    for symbol in ["$", ",", "+"]:
        df[column] = df[column].str.replace(symbol, "", regex=False)
df[df["Price"] != "0"][["Price", "Installs"]].head()

# Blank out the non-numeric leftovers so the conversion below can succeed
# (presumably these come from a malformed row in the CSV -- verify).
df["Price"] = df["Price"].str.replace("Everyone", "", regex=False)
df["Installs"] = df["Installs"].str.replace("Free", "", regex=False)

# Empty strings are parsed as NaN; any other non-numeric text still raises.
df["Price"] = pd.to_numeric(df["Price"])
df["Installs"] = pd.to_numeric(df["Installs"])
========================================================================================================================
df.head(2)

# Sunburst chart: categories on the inner ring, genres on the outer ring,
# with each sector sized by the summed number of installs.
fig = px.sunburst(
    df,
    path=['Category', 'Genres'],
    values='Installs',
    title='Hierarchy of apps',
)
fig.show()
# Convert the "Reviews" and "Size" text columns to numbers.

# A review count written like "3.0M" becomes "3000000"; regex=False keeps the
# "." literal instead of letting it match any character.
df['Reviews'] = pd.to_numeric(
    df['Reviews'].str.replace(".0M", "000000", regex=False)
)

# "Size" is turned into a number of megabytes:
#   "19M"                -> 19.0
#   "201k"               -> 0.201   (kilobytes scaled down by 1000)
#   "1,000+"             -> 1000.0
#   "Varies with device" -> NaN
# Everything is computed on Series that share df's index, so the result lines
# up with the right rows even though drop_duplicates left gaps in that index.
# (Building a fresh pd.Series from a list would restart the index at 0 and
# misalign the assignment.)
size_text = df['Size'].astype(str)
in_kilobytes = size_text.str.contains("k", regex=False)
size_digits = (
    size_text
    .str.replace("Varies with device", "", regex=False)
    .str.replace(r"[Mk+,]", "", regex=True)
)
size_number = pd.to_numeric(size_digits)
df['Size'] = size_number.where(~in_kilobytes, size_number / 1000)
# One row per category: number of apps, mean rating, and the summed reviews,
# size and installs, ordered from the category with the most apps downwards.
f_cat_count = (
    df.groupby('Category')
    .agg({
        "App": "count",
        "Rating": "mean",
        "Reviews": "sum",
        "Size": "sum",
        "Installs": "sum",
    })
    .reset_index()
    .sort_values("App", ascending=False)
    .reset_index(drop=True)
)
f_cat_count.head()

# Rescale the totals to friendlier units and round them for display.
f_cat_count['Reviews'] = (f_cat_count['Reviews'] / 10**6).round(2)
f_cat_count['Installs'] = (f_cat_count['Installs'] / 10**9).round(2)
f_cat_count['Size'] = (f_cat_count['Size'] / 10**3).round(1)
f_cat_count['Rating'] = f_cat_count['Rating'].round(2)

# Put the unit into the column names.
f_cat_count = f_cat_count.rename(
    columns={
        'Reviews': 'Reviews_in_M',
        'Installs': 'Installs_in_B',
        'Size': 'Size_in_G',
    }
)

# Start the row labels at 1 so they read as a ranking.
f_cat_count.index = f_cat_count.index + 1
f_cat_count.head()
# Bar chart of the app count per category; the remaining summary columns are
# available on hover.
fig = px.bar(
    f_cat_count,
    x='Category',
    y='App',
    hover_data=['Rating', 'Reviews_in_M', "Size_in_G", "Installs_in_B"],
    title='Apps count of each category',
)
# Tilt the category labels so they do not overlap.
fig.update_layout(xaxis_tickangle=-45)
fig.show()
=========================================================================
df.head(2)

# Work only with rows that have a value in every column.
df2 = df.dropna()

# Density heat map of app size against rating, with marginal histograms.
fig = px.density_heatmap(
    df2,
    x="Size",
    y="Rating",
    marginal_x="histogram",
    marginal_y="histogram",
    title='The relation between the size and the rating score of apps',
)
fig.show()

# Density heat map of app price against rating, with marginal histograms.
# The title now names the two plotted columns (it previously said
# "size and the price").
fig = px.density_heatmap(
    df2,
    x="Price",
    y="Rating",
    marginal_x="histogram",
    marginal_y="histogram",
    title='The relation between the price and the rating score of apps',
)
fig.show()
df2.head()

# The first 15 rows of f_cat_count, i.e. the 15 categories with the most apps
# when the table is sorted by app count. Computed once and reused for both the
# row filter and the axis ordering.
top_categories = list(f_cat_count['Category'][:15])
in_top_categories = df2['Category'].isin(top_categories)

# Price of every app in those categories, coloured by Type.
fig = px.strip(
    df2[in_top_categories],
    category_orders={"Category": top_categories},
    x="Price",
    y="Category",
    color="Type",
    hover_data=['App', "Rating"],
    title='The distribution of the price for each app category',
)
# Show the figure explicitly so it also appears when run as a plain script.
fig.show()

# Same plot restricted to apps priced at 100 or less.
fig = px.strip(
    df2[in_top_categories & (df2['Price'] <= 100)],
    category_orders={"Category": top_categories},
    x="Price",
    y="Category",
    color="Type",
    hover_data=['App', "Rating"],
    title='The distribution of the price for each app category without outliers',
)
fig.show()

# Installs per app split by Type, drawn on a logarithmic y-axis.
fig = px.box(
    df2,
    y='Installs',
    color='Type',
    log_y=True,
    hover_data=["App"],
    title='Distribution of app installed with the y-axis being in logarithmic scale for ease of visualization',
)
fig.show()
# Attach the user reviews to the cleaned app table. The inner join keeps only
# apps present in both tables; rows with any missing value are then dropped.
reviews_df = pd.read_csv("googleplaystore_user_reviews.csv")
df3 = pd.merge(df2, reviews_df, on='App', how="inner").dropna()
df3.head()

# Sentiment polarity of the reviews, split by Type.
fig = px.box(
    df3,
    y='Sentiment_Polarity',
    color='Type',
    hover_data=["App", "Translated_Review"],
    title='The distribution of rating sentiment polarity of apps',
)
# Show the figure explicitly so it also appears when run as a plain script.
fig.show()
df3.describe(include='all')

# Fold "Everyone 10+" into "Everyone". The result is assigned back to the
# column: calling replace(..., inplace=True) on a selected column acts on an
# intermediate object and leaves df3 unchanged under pandas Copy-on-Write.
df3["Content Rating"] = df3["Content Rating"].replace("Everyone 10+", "Everyone")

# Sentiment polarity for five selected categories, one grid row per category
# and one grid column per content rating.
selected_categories = ['GAME', 'FAMILY', 'HEALTH_AND_FITNESS', 'DATING', 'PRODUCTIVITY']
fig = px.box(
    df3[df3['Category'].isin(selected_categories)],
    y='Sentiment_Polarity',
    color='Type',
    hover_data=["App", "Translated_Review"],
    facet_col='Content Rating',
    facet_row="Category",
    height=1000,
    category_orders={"Content Rating": ["Everyone", "Teen", "Mature 17+"]},
    title='The distribution of rating sentiment polarity of apps based on Content rating and top 5 app category with the highest number of apps',
)
fig.show()